This R script is used to validate the points in the ReSurvey database using RS indicators (indices + phenology + canopy height).
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(here)
G3;here() starts at C:/Users/jimenezalfaro/OneDrive - Universidad de Oviedo/IMIB/Analyses/MOTIVATE/MOTIVATE_validation
g
library(gridExtra)
G3;
Adjuntando el paquete: ‘gridExtra’
gG3;The following object is masked from ‘package:dplyr’:
combine
g
library(readxl)
library(scales)
G3;
Adjuntando el paquete: ‘scales’
gG3;The following object is masked from ‘package:purrr’:
discard
gG3;The following object is masked from ‘package:readr’:
col_factor
g
library(sf)
G3;Linking to GEOS 3.13.1, GDAL 3.10.2, PROJ 9.5.1; sf_use_s2() is TRUE
g
library(rnaturalearth)
library(dtplyr)
G3;Registered S3 method overwritten by 'data.table':
method from
print.data.table
g
library(lme4)
G3;Cargando paquete requerido: Matrix
gG3;
Adjuntando el paquete: ‘Matrix’
gG3;The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
g
library(lmerTest)
G3;
Adjuntando el paquete: ‘lmerTest’
gG3;The following object is masked from ‘package:lme4’:
lmer
gG3;The following object is masked from ‘package:stats’:
step
g
library(car)
G3;Cargando paquete requerido: carData
gG3;
Adjuntando el paquete: ‘car’
gG3;The following object is masked from ‘package:dplyr’:
recode
gG3;The following object is masked from ‘package:purrr’:
some
g
library(ggeffects)
library(party)
G3;Cargando paquete requerido: grid
gG3;Cargando paquete requerido: mvtnorm
gG3;Cargando paquete requerido: modeltools
gG3;Cargando paquete requerido: stats4
gG3;
Adjuntando el paquete: ‘modeltools’
gG3;The following object is masked from ‘package:car’:
Predict
gG3;The following object is masked from ‘package:lme4’:
refit
gG3;Cargando paquete requerido: strucchange
gG3;Cargando paquete requerido: zoo
gG3;
Adjuntando el paquete: ‘zoo’
gG3;The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
gG3;Cargando paquete requerido: sandwich
gG3;
Adjuntando el paquete: ‘strucchange’
gG3;The following object is masked from ‘package:stringr’:
boundary
gG3;
Adjuntando el paquete: ‘party’
gG3;The following object is masked from ‘package:dplyr’:
where
g
library(partykit)
G3;Cargando paquete requerido: libcoin
gG3;
Adjuntando el paquete: ‘partykit’
gG3;The following objects are masked from ‘package:party’:
cforest, ctree, ctree_control, edge_simple, mob, mob_control,
node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
node_terminal, varimp
g
library(moreparty)
G3;Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
g
library(doParallel)
G3;Cargando paquete requerido: foreach
gG3;
Adjuntando el paquete: ‘foreach’
gG3;The following objects are masked from ‘package:purrr’:
accumulate, when
gG3;Cargando paquete requerido: iterators
gG3;Cargando paquete requerido: parallel
g
library(strucchange)
library(ggparty)
G3;
Adjuntando el paquete: ‘ggparty’
gG3;The following object is masked from ‘package:ggeffects’:
get_predictions
g
library(caret)
G3;Cargando paquete requerido: lattice
gG3;
Adjuntando el paquete: ‘caret’
gG3;The following object is masked from ‘package:purrr’:
lift
g
library(moreparty)
library(randomForest)
G3;randomForest 4.7-1.2
gG3;Type rfNews() to see new features/changes/bug fixes.
gG3;
Adjuntando el paquete: ‘randomForest’
gG3;The following object is masked from ‘package:gridExtra’:
combine
gG3;The following object is masked from ‘package:dplyr’:
combine
gG3;The following object is masked from ‘package:ggplot2’:
margin
g
library(pROC)
G3;Type 'citation("pROC")' for a citation.
gG3;
Adjuntando el paquete: ‘pROC’
gG3;The following objects are masked from ‘package:stats’:
cov, smooth, var
g
library(corrplot)
G3;corrplot 0.95 loaded
g
library(rlang)
G3;
Adjuntando el paquete: ‘rlang’
gG3;The following objects are masked from ‘package:purrr’:
%@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
flatten_raw, invoke, splice
g
library(stringr)
library(beepr)
library(foreach)
library(permimp)
printall <- function(tibble) {
  # Display a tibble with every column visible (no width truncation).
  print(tibble, width = Inf)
}
# Raincloud half-violin geom by Ben Marwick (not on CRAN), sourced from a gist
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
# Define the folder path
folder_path <- here("objects", "10")
# List all .RData or .rda files in the folder
# NOTE(review): no `pattern` argument is given, so EVERY file in the folder
# is listed, not only .RData/.rda — confirm the folder holds nothing else.
rdata_files <- list.files(folder_path, full.names = TRUE)
# Load each file
# load() returns the names of the objects it created (echoed in the output)
lapply(rdata_files, load, envir = .GlobalEnv)
[[1]]
[1] "predictions_rf0_S2"
[[2]]
[1] "predictions_rf1_S2"
[[3]]
[1] "predictions_rf2_S2"
[[4]]
[1] "predictions_rf3_S2"
[[5]]
[1] "predictions_rf4_S2"
[[6]]
[1] "predictions_rf5_S2"
[[7]]
[1] "rf0_S2"
[[8]]
[1] "rf1_S2"
[[9]]
[1] "rf2_S2"
[[10]]
[1] "rf3_S2"
[[11]]
[1] "rf4_S2"
[[12]]
[1] "rf5_S2"
[[13]]
[1] "roc_data0_S2"
[[14]]
[1] "roc_data1_S2"
[[15]]
[1] "roc_data2_S2"
[[16]]
[1] "roc_data3_S2"
[[17]]
[1] "roc_data4_S2"
[[18]]
[1] "roc_data5_S2"
[[19]]
[1] "varimp_rf0_S2"
data_validation<-read_tsv(here("data", "clean","final_RS_data_bands_S2_all.csv"))
Rows: 32661 Columns: 56
── Column specification ────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): EUNISa_1, EUNISa_1_descr, EUNISa_2, EUNISa_2_descr, biogeo, unit, Lctnmth
dbl (49): PlotObservationID, EVI_max, NDVI_max, SAVI_max, EVI_min, NDVI_min, SAVI_mi...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
No parsing issues!
Do when all RS data is ready!
# Define a function to create histograms
plot_histogram <- function(data, x_var, x_label) {
  # Histogram of `x_var` (column name as a string), restricted to the four
  # target EUNIS level-1 habitats: forests (T), grasslands (R),
  # shrublands (S) and wetlands (Q).
  habitats <- c("T", "R", "S", "Q")
  plot_data <- dplyr::filter(data, EUNISa_1 %in% habitats)
  ggplot(plot_data, aes(x = .data[[x_var]])) +
    geom_histogram(color = "black", fill = "white") +
    labs(x = x_label, y = "Frequency") +
    theme_bw()
}
# Define a function to create plots with violin + boxplot + points
# Define a function to create plots with violin + boxplot + points
distr_plot <- function(data, y_vars, y_labels) {
  # Raincloud-style plot (flat violin + jittered points + boxplot) of each
  # variable in `y_vars` by EUNIS level-1 habitat (T/R/S/Q only), printed
  # one plot per variable. `y_labels` gives the matching axis labels.
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    p <- ggplot(data = data %>%
                  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = !!sym(y_var), fill = EUNISa_1_descr)) +
      geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
      geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
                 position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
      geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
      # Mean marker; `fun` replaces `fun.y` (deprecated since ggplot2 3.3.0)
      stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
      # Sample size printed just above each group's maximum;
      # after_stat(label) replaces the deprecated `..label..` notation
      stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                     label = length(x)),
                   geom = "text", aes(label = after_stat(label)), vjust = 0.5) +
      labs(y = y_label, x = "EUNIS level 1") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      # "none" replaces logical FALSE (deprecated since ggplot2 3.3.4)
      guides(fill = "none", color = "none") +
      theme_bw() + coord_flip()
    print(p)
  }
}
Ranges of min and max:
range(data_validation$NDVI_max, na.rm = T) # NDVI_max > 1 (slightly)
[1] 0.06064533 1.08505435
range(data_validation$NDMI_max, na.rm = T) # NDMI_max > 1 (slightly)
[1] -0.2202634 1.0960134
range(data_validation$NDWI_max, na.rm = T)
[1] -0.7291356 0.5942999
range(data_validation$SAVI_max, na.rm = T)
[1] 0.02386576 0.85981369
range(data_validation$EVI_max, na.rm = T) # EVI_max > 1 (slightly)
[1] 0.0475601 1.1938362
range(data_validation$NDVI_min, na.rm = T)
[1] -0.7058125 0.8423388
range(data_validation$NDMI_min, na.rm = T)
[1] -0.5601429 0.5758041
range(data_validation$NDWI_min, na.rm = T) # NDWI_min < -1 (slightly)
[1] -1.014830371 -0.008258131
range(data_validation$SAVI_min, na.rm = T)
[1] -0.5367437 0.6335289
range(data_validation$EVI_min, na.rm = T) # EVI_min < -1!
[1] -1.5674414 0.7891647
nrow(data_validation %>% dplyr::filter(NDVI_max > 1))
[1] 18
nrow(data_validation %>% dplyr::filter(NDMI_max > 1))
[1] 51
nrow(data_validation %>% dplyr::filter(EVI_max > 1))
[1] 9
nrow(data_validation %>% dplyr::filter(NDWI_min < -1))
[1] 4
nrow(data_validation %>% dplyr::filter(EVI_min < -1))
[1] 10
Histograms to check that max and min values are ok:
# Sanity-check histograms of the seasonal maxima and minima of each index
plot_histogram(data_validation, "NDVI_max", "NDVI max")
plot_histogram(data_validation, "NDMI_max", "NDMI max")
plot_histogram(data_validation, "NDWI_max", "NDWI max")
plot_histogram(data_validation, "SAVI_max", "SAVI max")
plot_histogram(data_validation, "EVI_max", "EVI max")
plot_histogram(data_validation, "NDVI_min", "NDVI min")
plot_histogram(data_validation, "NDMI_min", "NDMI min")
plot_histogram(data_validation, "NDWI_min", "NDWI min")
plot_histogram(data_validation, "SAVI_min", "SAVI min")
plot_histogram(data_validation, "EVI_min", "EVI min")
nrow(data_validation %>%
dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
dplyr::filter(EVI_max > 1 | EVI_max < -1))
[1] 9
data_validation %>%
dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q"))%>%
dplyr::filter(EVI_max > 1 | EVI_max < -1) %>%
count(biogeo, unit)
Most EVI values are ok!
Distribution plots:
# Raincloud plots of index extremes; EVI_min outliers (<= -0.5) excluded
distr_plot(data_validation %>% dplyr::filter(EVI_min > -0.5),
c("NDVI_max", "EVI_max", "SAVI_max", "NDMI_max", "NDWI_max",
"NDVI_min", "EVI_min", "SAVI_min", "NDMI_min", "NDWI_min"),
c("NDVI_max", "EVI_max", "SAVI_max", "NDMI_max", "NDWI_max",
"NDVI_min", "EVI_min", "SAVI_min", "NDMI_min", "NDWI_min"))
# Same plot for canopy height, full data set
distr_plot(data_validation, "canopy_height", "Canopy height (m)")
# Stacked bar chart: canopy-height categories within each EUNIS level-1 class
ggplot(data_validation %>%
# Keep only forests, grasslands, shrublands and wetlands
dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
# Bin canopy height into ordered height classes for the fill scale
mutate(CH_cat =
factor(
case_when(canopy_height == 0 ~ "0 m",
canopy_height > 0 & canopy_height <= 1 ~ "0-1 m",
canopy_height > 1 & canopy_height <=2 ~ "1-2 m",
canopy_height > 2 & canopy_height <=5 ~ "2-5 m",
canopy_height > 5 & canopy_height <=8 ~ "5-8 m",
canopy_height > 8 ~ "> 8 m",
is.na(canopy_height) ~ NA_character_),
levels = c(
"0 m", "0-1 m", "1-2 m", "2-5 m", "5-8 m", "> 8 m"))),
aes(x = EUNISa_1_descr, fill = CH_cat)) +
geom_bar() + theme_bw() + coord_flip() +
scale_y_continuous(labels = label_number()) +
# Reversed viridis so taller height classes get the darker colours
scale_fill_viridis_d(direction = -1) +
labs(x = "EUNIS level 1", fill = "Canopy height") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
# Place the legend inside the panel (upper-right area)
theme(legend.position = c(0.8, 0.75),
legend.direction = "vertical")
# Summary statistics of canopy height per EUNIS level-1 habitat.
# FIX: passing `na.rm = TRUE` through across()'s `...` is deprecated in
# dplyr >= 1.1.0; supply it inside per-function lambdas instead.
data_validation %>%
  # Keep only forests, grasslands, shrublands and wetlands
  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  group_by(EUNISa_1_descr) %>%
  summarise(across(canopy_height, list(
    mean = ~ mean(.x, na.rm = TRUE),
    median = ~ median(.x, na.rm = TRUE),
    sd = ~ sd(.x, na.rm = TRUE),
    min = ~ min(.x, na.rm = TRUE),
    max = ~ max(.x, na.rm = TRUE)
  )))
Only using NDVI- and SAVI-based values so far.
Maximum NDVI should be equal to value at peak?
nrow(data_validation %>% dplyr::filter(NDVI_pos_value != NDVI_max))
[1] 590
Not sure why this happens, but in most cases the difference is small (< -0.1). Anyway, we should use only one of these two in the RF models.
# Histograms of all phenology metrics: day-of-year and index value at start
# (sos), peak (pos) and end (eos) of season, season length (gsd), the
# pos-sos / pos-eos differences, and the seasonal AUC
plot_histogram(data_validation, "NDVI_sos_doy", "NDVI_sos_doy")
plot_histogram(data_validation, "NDVI_pos_doy", "NDVI_pos_doy")
plot_histogram(data_validation, "NDVI_eos_doy", "NDVI_eos_doy")
plot_histogram(data_validation, "EVI_sos_doy", "EVI_sos_doy")
plot_histogram(data_validation, "EVI_pos_doy", "EVI_pos_doy")
plot_histogram(data_validation, "EVI_eos_doy", "EVI_eos_doy")
plot_histogram(data_validation, "SAVI_sos_doy", "SAVI_sos_doy")
plot_histogram(data_validation, "SAVI_pos_doy", "SAVI_pos_doy")
plot_histogram(data_validation, "SAVI_eos_doy", "SAVI_eos_doy")
plot_histogram(data_validation, "NDVI_sos_value", "NDVI_sos_value")
plot_histogram(data_validation, "NDVI_pos_value", "NDVI_pos_value")
plot_histogram(data_validation, "NDVI_eos_value", "NDVI_eos_value")
plot_histogram(data_validation, "EVI_sos_value", "EVI_sos_value")
plot_histogram(data_validation, "EVI_pos_value", "EVI_pos_value")
plot_histogram(data_validation, "EVI_eos_value", "EVI_eos_value")
plot_histogram(data_validation, "NDVI_gsd", "NDVI_gsd")
plot_histogram(data_validation, "EVI_gsd", "EVI_gsd")
plot_histogram(data_validation, "SAVI_gsd", "SAVI_gsd")
plot_histogram(data_validation, "NDVI_diff_pos_sos_value",
"NDVI_diff_pos_sos_value")
plot_histogram(data_validation, "EVI_diff_pos_sos_value",
"EVI_diff_pos_sos_value")
plot_histogram(data_validation, "SAVI_diff_pos_sos_value",
"SAVI_diff_pos_sos_value")
plot_histogram(data_validation, "NDVI_diff_pos_eos_value",
"NDVI_diff_pos_eos_value")
plot_histogram(data_validation, "EVI_diff_pos_eos_value",
"EVI_diff_pos_eos_value")
plot_histogram(data_validation, "SAVI_diff_pos_eos_value",
"SAVI_diff_pos_eos_value")
plot_histogram(data_validation, "NDVI_diff_pos_sos_doy",
"NDVI_diff_pos_sos_doy")
plot_histogram(data_validation, "EVI_diff_pos_sos_doy",
"EVI_diff_pos_sos_doy")
plot_histogram(data_validation, "SAVI_diff_pos_sos_doy",
"SAVI_diff_pos_sos_doy")
plot_histogram(data_validation, "NDVI_diff_eos_pos_doy",
"NDVI_diff_eos_pos_doy")
plot_histogram(data_validation, "EVI_diff_eos_pos_doy",
"EVI_diff_eos_pos_doy")
plot_histogram(data_validation, "SAVI_diff_eos_pos_doy",
"SAVI_diff_eos_pos_doy")
plot_histogram(data_validation, "NDVI_auc", "NDVI_auc")
plot_histogram(data_validation, "EVI_auc", "EVI_auc")
plot_histogram(data_validation, "SAVI_auc", "SAVI_auc")
# Raincloud plots of index values and day-of-year at SOS/POS/EOS
distr_plot(data_validation,
           c("NDVI_sos_value","NDVI_pos_value", "NDVI_eos_value",
             "EVI_sos_value","EVI_pos_value", "EVI_eos_value",
             "SAVI_sos_value", "SAVI_pos_value", "SAVI_eos_value",
             "NDVI_sos_doy","NDVI_pos_doy", "NDVI_eos_doy",
             "EVI_sos_doy","EVI_pos_doy", "EVI_eos_doy",
             "SAVI_sos_doy", "SAVI_pos_doy", "SAVI_eos_doy"),
           # Axis labels mirror the variable names.
           # FIX: "SAVI_sos_Value" -> "SAVI_sos_value" (label typo)
           c("NDVI_sos_value","NDVI_pos_value", "NDVI_eos_value",
             "EVI_sos_value","EVI_pos_value", "EVI_eos_value",
             "SAVI_sos_value", "SAVI_pos_value", "SAVI_eos_value",
             "NDVI_sos_doy","NDVI_pos_doy", "NDVI_eos_doy",
             "EVI_sos_doy","EVI_pos_doy", "EVI_eos_doy",
             "SAVI_sos_doy", "SAVI_pos_doy", "SAVI_eos_doy")
)
# Raincloud plots of season length and SOS/POS/EOS differences
distr_plot(data_validation,
           c("NDVI_gsd","EVI_gsd", "SAVI_gsd",
             "NDVI_diff_pos_sos_value", "EVI_diff_pos_sos_value",
             "SAVI_diff_pos_sos_value",
             "NDVI_diff_pos_eos_value", "EVI_diff_pos_eos_value",
             "SAVI_diff_pos_eos_value",
             "NDVI_diff_pos_sos_doy", "EVI_diff_pos_sos_doy",
             "SAVI_diff_pos_sos_doy",
             "NDVI_diff_eos_pos_doy", "EVI_diff_eos_pos_doy",
             "SAVI_diff_eos_pos_doy"),
           # Axis labels mirror the variable names.
           # FIX: "SAVI_diff_pos_sos_Value" -> "SAVI_diff_pos_sos_value"
           c("NDVI_gsd","EVI_gsd", "SAVI_gsd",
             "NDVI_diff_pos_sos_value", "EVI_diff_pos_sos_value",
             "SAVI_diff_pos_sos_value",
             "NDVI_diff_pos_eos_value", "EVI_diff_pos_eos_value",
             "SAVI_diff_pos_eos_value",
             "NDVI_diff_pos_sos_doy", "EVI_diff_pos_sos_doy",
             "SAVI_diff_pos_sos_doy",
             "NDVI_diff_eos_pos_doy", "EVI_diff_eos_pos_doy",
             "SAVI_diff_eos_pos_doy")
)
# Raincloud plots of seasonal AUC for the three greenness indices
distr_plot(data_validation,
c("NDVI_auc", "EVI_auc", "SAVI_auc"),
c("NDVI_auc", "EVI_auc", "SAVI_auc"))
# Define a function to create plots with violin + boxplot + points
# Define a function to create plots with violin + boxplot + points
distr_plot_biogeo <- function(data, y_vars, y_labels) {
  # Raincloud-style plot of each variable in `y_vars` by EUNIS level-1
  # habitat (T/R/S/Q), faceted by biogeographic region. Unlike distr_plot(),
  # returns a named list of ggplot objects (one per variable) without
  # printing them.
  plots <- list()
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    p <- ggplot(data = data %>%
                  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = !!sym(y_var), fill = EUNISa_1_descr)) +
      geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
      geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
                 position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
      geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
      # Mean marker; `fun` replaces `fun.y` (deprecated since ggplot2 3.3.0)
      stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
      # Sample size above each group's maximum;
      # after_stat(label) replaces the deprecated `..label..` notation
      stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                     label = length(x)),
                   geom = "text", aes(label = after_stat(label)), vjust = 0.5) +
      labs(y = y_label, x = "EUNISa_1_descr") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      # "none" replaces logical FALSE (deprecated since ggplot2 3.3.4)
      guides(fill = "none", color = "none") +
      theme_bw() + coord_flip() + facet_wrap(~ biogeo)
    plots[[y_var]] <- p
  }
  return(plots)
}
Distribution plots:
distr_plot_biogeo(data_validation, "canopy_height", "Canopy height (m)")
$canopy_height
RF models fitted using the conditional inference version of random forest (first cforest in package party, now fastcforest in package moreparty). Suggested if the data are highly correlated. Cforest is more stable in deriving variable importance values in the presence of highly correlated variables, thus providing better accuracy in calculating variable importance (ref below).
Hothorn, T., Hornik, K. and Zeileis, A. (2006) Unbiased Recursive Partitioning: A Conditional Inference Framework. Journal of Computational and Graphical Statistics, 15, 651-674. http://dx.doi.org/10.1198/106186006X133933
Choose the hyperparameter mtry based on the square root of the number of predictor variables:
Hastie, T., Tibshirani, R., & Friedman, J. (2009). The elements of statistical learning: Data mining, inference, and prediction. Springer Science & Business Media.
Maybe TO_DO: We varied ntree from 50 to 800 in steps of 50, leaving mtry constant at 2. This parameter variation showed that ntree=500 was optimal, while higher ntree led to no further model improvement (Supplementary Fig. S10). Subsequently, the hyperparameter mtry was varied from 2 to 8 with constant ntree=500. Here, mtry=3 led to the best results in almost all cases (Supplementary Fig. S11). Consequently, we chose ntree=500 and mtry=3 for our main analysis across all study sites.
Define a function to run fastcforest models:
run_rf <- function(vars_RF, train_data, response_var, ntree = 500)
{
  # Fit a conditional-inference random forest (moreparty::fastcforest) in
  # parallel, with mtry = round(sqrt(p)) following Hastie et al. (2009).
  #
  # vars_RF      - character vector of predictor column names
  # train_data   - training data frame
  # response_var - name of the response column (string)
  # ntree        - number of trees (default 500)
  # Returns list(model = fitted forest, time = proc_time of the fit).
  #
  # Detect and register available cores (leave one free); guard against
  # single-core machines where detectCores() - 1 would be 0
  n_cores <- max(1L, parallel::detectCores() - 1L)
  cl <- makeCluster(n_cores)
  # FIX: release the cluster even if fitting errors out (the original only
  # stopped it on the success path, leaking worker processes on failure)
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  train_name <- deparse(substitute(train_data))
  # Export necessary variables to the cluster.
  # NOTE(review): clusterExport() resolves names in .GlobalEnv by default,
  # so `vars_RF` and the object named by `train_name` must exist there —
  # confirm before calling run_rf() from inside another function.
  clusterExport(cl, varlist = c("vars_RF", train_name))
  # Set seed for reproducibility
  set.seed(123)
  # Measure execution time
  execution_time <- system.time({
    rf_model <- fastcforest(
      formula = reformulate(vars_RF, response = response_var),
      data = train_data,
      controls = party::cforest_control(
        mtry = round(sqrt(length(vars_RF))),
        ntree = ntree
      ),
      parallel = TRUE
    )
  })
  # Return both the model and execution time
  list(model = rf_model, time = execution_time)
}
compute_varimp <- function(model, nperm = 100,
                           n_cores = parallel::detectCores() - 1) {
  # Average unconditional permutation importance over `nperm` single
  # permutations of party::varimp(), one permutation per foreach task.
  #
  # Returns list(varimp = named numeric vector, time = proc_time).
  # NOTE(review): worker RNG streams are not seeded (no doRNG), so results
  # are not reproducible across runs — confirm whether that matters here.
  #
  # Set up parallel backend; guard against n_cores <= 0 on small machines
  cl <- makeCluster(max(1, n_cores))
  # FIX: release the cluster even if a worker errors (the original leaked
  # worker processes on failure)
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  # Measure execution time
  execution_time <- system.time({
    # `+` combine sums the nperm importance vectors element-wise
    varimp_list <- foreach(i = seq_len(nperm), .combine = "+",
                           .packages = "party") %dopar% {
      varimp(model, conditional = FALSE, nperm = 1)
    }
  })
  # Average the summed results
  varimp_avg <- varimp_list / nperm
  list(varimp = varimp_avg, time = execution_time)
}
Using permimp() from the permimp package: https://cran.r-project.org/web/packages/permimp/vignettes/permimp-package.html#fn1
compute_varimp <- function(model, nperm = 100) {
  # Unconditional permutation importance via permimp::permimp().
  # (Redefines the foreach-based compute_varimp above.)
  #
  # FIX: `nperm` was previously accepted but never passed on, so permimp()
  # always ran with its default nperm = 1; it is now forwarded.
  # Returns list(varimp = permimp object, time = proc_time).
  execution_time <- system.time({
    varimp_result <- permimp(model, nperm = nperm, conditional = FALSE,
                             progressBar = TRUE)
  })
  return(list(varimp = varimp_result, time = execution_time))
}
compute_varimp_cond <- function(model, nperm = 100) {
  # Conditional permutation importance via permimp::permimp().
  #
  # FIX: `nperm` was previously accepted but never passed on, so permimp()
  # always ran with its default nperm = 1; it is now forwarded.
  # Returns list(varimp = permimp object, time = proc_time).
  execution_time <- system.time({
    varimp_result <- permimp(model, nperm = nperm, conditional = TRUE,
                             progressBar = TRUE)
  })
  return(list(varimp = varimp_result, time = execution_time))
}
compute_roc_level1 <- function(model, test_data) {
  # One-vs-rest ROC curves for the level-1 EUNIS classifier (Q/R/S/T).
  #
  # Returns list(roc = data.frame(FPR, TPR, Class with AUC in the label),
  #              time = proc_time).
  execution_time <- system.time({
    # Step 1: Predict class probabilities (party returns a list of 1-row
    # matrices, one per observation)
    probabilities <- predict(model, newdata = test_data, type = "prob")
    # Step 2: Convert list of matrices to a proper data frame
    prob_matrix <- t(sapply(probabilities, as.vector))
    prob_df <- as.data.frame(prob_matrix)
    # NOTE(review): assumes the probability columns come back in
    # alphabetical class order (Q, R, S, T) — confirm against the
    # colnames of the raw prediction matrices.
    colnames(prob_df) <- c("Q", "R", "S", "T")
    # Step 3: Prepare actual class labels
    actual <- factor(test_data$EUNISa_1, levels = c("Q", "R", "S", "T"))
    # Step 4: Binarize actual labels (one indicator column per class)
    actual_bin <- model.matrix(~ actual - 1)
    colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
    # Step 5: Compute ROC data for each class
    roc_data <- lapply(levels(actual), function(class) {
      roc_obj <- roc(actual_bin[, class], prob_df[[class]])
      auc_val <- round(auc(roc_obj), 3)
      data.frame(
        # FIX: FPR is 1 - specificity; the original stored the raw
        # specificities, mirroring the curve about the y = x diagonal
        FPR = rev(1 - roc_obj$specificities),
        TPR = rev(roc_obj$sensitivities),
        Class = paste0(class, " (AUC = ", auc_val, ")")
      )
    }) %>% bind_rows()
  })
  # Return both ROC data and execution time
  return(list(roc = roc_data, time = execution_time))
}
compute_roc_level2 <- function(model, test_data) {
  # One-vs-rest ROC curves for the level-2 EUNIS classifier (14 classes).
  #
  # Returns list(roc = data.frame(FPR, TPR, Class with AUC in the label),
  #              time = proc_time).
  execution_time <- system.time({
    # Step 1: Predict class probabilities (list of 1-row matrices)
    probabilities <- predict(model, newdata = test_data, type = "prob")
    # Step 2: Convert list of matrices to a proper data frame
    prob_matrix <- t(sapply(probabilities, as.vector))
    prob_df <- as.data.frame(prob_matrix)
    # NOTE(review): assumes the probability columns come back in
    # alphabetical class order — confirm against the colnames of the raw
    # prediction matrices.
    colnames(prob_df) <- c("Q1", "Q2", "Q4", "Q5", "R1", "R2", "R3", "R4", "R5",
                           "R6", "S3", "S4", "T1", "T3")
    # Step 3: Prepare actual class labels
    actual <- factor(test_data$EUNISa_2,
                     levels = c("Q1", "Q2", "Q4", "Q5", "R1", "R2", "R3", "R4",
                                "R5", "R6", "S3", "S4", "T1", "T3"))
    # Step 4: Binarize actual labels (one indicator column per class)
    actual_bin <- model.matrix(~ actual - 1)
    colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
    # Step 5: Compute ROC data for each class
    roc_data <- lapply(levels(actual), function(class) {
      roc_obj <- roc(actual_bin[, class], prob_df[[class]])
      auc_val <- round(auc(roc_obj), 3)
      data.frame(
        # FIX: FPR is 1 - specificity; the original stored the raw
        # specificities, mirroring the curve about the y = x diagonal
        FPR = rev(1 - roc_obj$specificities),
        TPR = rev(roc_obj$sensitivities),
        Class = paste0(class, " (AUC = ", auc_val, ")")
      )
    }) %>% bind_rows()
  })
  # Return both ROC data and execution time
  return(list(roc = roc_data, time = execution_time))
}
# Predictor set for the random-forest models: RS index extremes, phenology
# metrics (SOS/POS/EOS), season length, and canopy height.
# NOTE(review): seasonal maxima of NDVI/EVI/SAVI are omitted — presumably
# because they duplicate the *_pos_value variables; confirm.
vars_RF <- c(
# Min values of all indices
"NDVI_min", "EVI_min", "NDMI_min", "NDWI_min", "SAVI_min",
# Max values of NDMI and NDWI
"NDMI_max", "NDWI_max",
# AUC of NDVI, EVI and SAVI
"NDVI_auc", "EVI_auc", "SAVI_auc",
# Values of NDVI, EVI and SAVI at sos, pos and eos
"NDVI_sos_value", "NDVI_pos_value", "NDVI_eos_value",
"EVI_sos_value", "EVI_pos_value", "EVI_eos_value",
"SAVI_sos_value", "SAVI_pos_value", "SAVI_eos_value",
# Differences pos-sos in value and doy
"NDVI_diff_pos_sos_value", "EVI_diff_pos_sos_value", "SAVI_diff_pos_sos_value",
"NDVI_diff_pos_sos_doy", "EVI_diff_pos_sos_doy", "SAVI_diff_pos_sos_doy",
# Differences pos-eos in value and doy
"NDVI_diff_pos_eos_value", "EVI_diff_pos_eos_value","SAVI_diff_pos_eos_value",
"NDVI_diff_eos_pos_doy", "EVI_diff_eos_pos_doy", "SAVI_diff_eos_pos_doy",
# Growing season duration
"NDVI_gsd", "EVI_gsd", "SAVI_gsd",
# Canopy height
"canopy_height")
# Baseline data set for RF model 0: all location methods
filtered_data0 <- data_validation %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  # Drop rows whose spectral indices fall outside the valid [-1, 1] range
  dplyr::filter(EVI_max <= 1, EVI_min >= -1,
                NDVI_max <= 1, NDMI_max <= 1, NDWI_min >= -1) %>%
  # Keep only rows complete for every RF predictor
  dplyr::filter(if_all(all_of(vars_RF), ~ !is.na(.x))) %>%
  # Phenology differences (pos-sos, pos-eos, ...) must be strictly positive
  dplyr::filter(if_all(contains("diff"), ~ .x > 0)) %>%
  # Keep only the ID, the response and the predictors
  select(PlotObservationID, EUNISa_1, all_of(vars_RF))
Correlation of all variables to be included in RF models:
# Correlation heatmap of all candidate RF predictors
corrplot(filtered_data0 %>%
dplyr::select(all_of(vars_RF)) %>%
cor(use = "pairwise.complete.obs"),
method = "color", type = "upper", tl.col = "black", tl.srt = 45)
# Compute correlation matrix
cor_matrix <- filtered_data0 %>%
dplyr::select(all_of(vars_RF)) %>%
cor(use = "pairwise.complete.obs")
# Get the upper triangle of the matrix without the diagonal
# (lower triangle + diagonal masked to NA so each pair is listed once)
cor_matrix[lower.tri(cor_matrix, diag = TRUE)] <- NA
# Find variable pairs with correlation > 0.7 or < -0.7
# (which() silently skips the NA entries of the masked triangle)
high_corr_pairs <- which(abs(cor_matrix) > 0.7, arr.ind = TRUE)
# Create a data frame of the results
cor_pairs_df <- data.frame(
Var1 = rownames(cor_matrix)[high_corr_pairs[, 1]],
Var2 = colnames(cor_matrix)[high_corr_pairs[, 2]],
Correlation = cor_matrix[high_corr_pairs]
)
# View the result
print(cor_pairs_df)
Split filtered_data0 into training and test data sets.
set.seed(123)
# 70/30 train/test split. seq_len() avoids the 1:n footgun (1:0 == c(1, 0)
# when nrow is 0) and floor() makes the size truncation explicit; the RNG
# draws are identical to the original, so the split — and the saved model
# objects that depend on it — is unchanged.
train_indices0 <- sample(seq_len(nrow(filtered_data0)),
                         size = floor(0.7 * nrow(filtered_data0)))
train_data0 <- filtered_data0[train_indices0, ]
test_data0 <- filtered_data0[-train_indices0, ]
Number of points per category for filtered data:
filtered_data0 %>% count(EUNISa_1)
Confusion matrix:
confusionMatrix(predictions_rf0_S2, test_data0$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 963 295 140 16
R 795 4214 448 124
S 216 187 869 42
T 25 81 52 659
Overall Statistics
Accuracy : 0.7347
95% CI : (0.7255, 0.7438)
No Information Rate : 0.5234
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5679
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.4817 0.8821 0.57588 0.78359
Specificity 0.9367 0.6857 0.94158 0.98093
Pos Pred Value 0.6810 0.7551 0.66134 0.80661
Neg Pred Value 0.8657 0.8412 0.91807 0.97810
Prevalence 0.2190 0.5234 0.16535 0.09215
Detection Rate 0.1055 0.4618 0.09522 0.07221
Detection Prevalence 0.1549 0.6115 0.14398 0.08952
Balanced Accuracy 0.7092 0.7839 0.75873 0.88226
Variable Importance Plot
plot(varimp_rf0_S2$varimp, margin = c(9, 3, 2, 2))
ROC curves:
# ROC curves for model 0; the dashed diagonal marks a random classifier
roc0 <- ggplot(roc_data0_S2$roc, aes(x = FPR, y = TPR, color = Class)) +
  # FIX: `linewidth` replaces the `size` line aesthetic (deprecated in
  # ggplot2 3.4.0 — see the warning emitted by the original call)
  geom_line(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
Warning :Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_]8;;ide:run:warnings()warnings()]8;;` to see where this warning was generated.
roc0
# Data set for RF model 1: GPS-located plots only
filtered_data1 <- data_validation %>%
  # Select only GPS points (standard or differential); %in% drops NA
  # location methods just as the original == | == test did
  dplyr::filter(Lctnmth %in% c("Location with GPS",
                               "Location with differential GPS")) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  # Drop rows whose spectral indices fall outside the valid [-1, 1] range
  dplyr::filter(EVI_max <= 1, EVI_min >= -1,
                NDVI_max <= 1, NDMI_max <= 1, NDWI_min >= -1) %>%
  # Keep only rows complete for every RF predictor
  dplyr::filter(if_all(all_of(vars_RF), ~ !is.na(.x))) %>%
  # Phenology differences must be strictly positive
  dplyr::filter(if_all(contains("diff"), ~ .x > 0)) %>%
  # Keep only the ID, the response and the predictors
  select(PlotObservationID, EUNISa_1, all_of(vars_RF))
Split into training and test data sets.
set.seed(123)
# 70/30 train/test split; seq_len()/floor() replace the 1:n idiom with
# identical RNG draws, so the split is unchanged.
train_indices1 <- sample(seq_len(nrow(filtered_data1)),
                         size = floor(0.7 * nrow(filtered_data1)))
train_data1 <- filtered_data1[train_indices1, ]
test_data1 <- filtered_data1[-train_indices1, ]
Number of points per category for filtered data:
filtered_data1 %>% count(EUNISa_1)
Confusion matrix:
confusionMatrix(predictions_rf1_S2, test_data1$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 1077 341 219 13
R 537 2535 336 69
S 201 158 835 15
T 16 24 15 90
Overall Statistics
Accuracy : 0.7
95% CI : (0.6887, 0.7112)
No Information Rate : 0.4718
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5268
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.5882 0.8290 0.5943 0.48128
Specificity 0.8768 0.7248 0.9263 0.99126
Pos Pred Value 0.6527 0.7291 0.6907 0.62069
Neg Pred Value 0.8439 0.8259 0.8919 0.98469
Prevalence 0.2825 0.4718 0.2168 0.02885
Detection Rate 0.1662 0.3911 0.1288 0.01389
Detection Prevalence 0.2546 0.5365 0.1865 0.02237
Balanced Accuracy 0.7325 0.7769 0.7603 0.73627
Variable Importance Plot
plot(varimp_rf1_S2$varimp, margin = c(9, 3, 2, 2))
plot(varimp_rf1_cond_S2$varimp, margin = c(9, 3, 2, 2))
ROC curves:
# ROC curves for model 1; the dashed diagonal marks a random classifier
roc1 <- ggplot(roc_data1_S2$roc, aes(x = FPR, y = TPR, color = Class)) +
  # FIX: `linewidth` replaces the `size` line aesthetic (deprecated in
  # ggplot2 3.4.0)
  geom_line(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc1
Define a set of rules for a first validation of ALL ReSurvey data (“Expert-based” rules). Not very ambitious, only taking out observations that are clearly wrong on the basis of indicator values.
Create column for first validation based on different indicators, where “wrong” is noted when the validation rule is not met.
Define rules:
# Expert-rule validation flags: each valid_1_* column is "wrong" when the
# corresponding rule is broken and NA otherwise.
data_validation <-
data_validation %>%
mutate(
valid_1_NDWI = case_when(
# Points that are basically water
NDWI_max > 0.3 ~ "wrong",
TRUE ~ NA_character_),
valid_1_CH = case_when(
# R & Q points with high CH
EUNISa_1 %in% c("R", "Q") & canopy_height > 2 ~ "wrong",
# T points with low CH
EUNISa_1 == "T" & canopy_height < 3 ~ "wrong",
# S points with high CH
EUNISa_1 == "S" & canopy_height > 3 ~ "wrong",
TRUE ~ NA_character_),
# No rules for NDVI, we will take out observations based on distributions
# Count how many validation rules are not met
valid_1_count = rowSums(across(c(valid_1_NDWI, valid_1_CH), ~ . == "wrong"),
na.rm = TRUE),
# Points where at least 1 rule not met.
# NOTE(review): rows with missing RS data can never be flagged, so they
# end up as "No rules broken so far" even though nothing was checked
# (acknowledged in the narrative below the plot).
valid_1 = if_else(valid_1_count > 0, "At least 1 rule broken",
"No rules broken so far")
)
# Bar chart of how many rules each observation breaks, coloured by which
# rule(s) were broken (NA fill = no rule broken)
ggplot(data_validation%>%
mutate(rules_broken = case_when(
valid_1_count == 1 & valid_1_NDWI == "wrong" ~ "NDWI",
valid_1_count == 1 & valid_1_CH == "wrong" ~ "CH",
valid_1_count == 2 &
valid_1_NDWI == "wrong" & valid_1_CH == "wrong"~ "NDWI + CH",
TRUE ~ NA_character_
)),
aes(x = valid_1_count, fill = rules_broken)) +
geom_bar() + labs(x = "Number of broken rules")
Proportion of observations not validated (so far):
nrow(data_validation %>% dplyr::filter(valid_1_count > 0))/
nrow(data_validation)
[1] 0.0866783
But be aware that there are still MANY missing RS data.
# Bar chart: validation outcome split by differential-GPS location method
ggplot(data_validation %>%
# "yes" only for differential GPS; any other method or NA becomes "no"
mutate(diff_GPS = if_else(
Lctnmth != "Location with differential GPS" |
is.na(Lctnmth), "no", "yes")),
aes(x = diff_GPS, fill = valid_1)) +
geom_bar() + labs(x = "Differential GPS")
# Bar chart: validation outcome split by GPS vs non-GPS location method.
# %in% returns FALSE for NA, so missing methods map to "no" exactly as the
# original case_when did.
ggplot(data_validation %>%
         mutate(GPS = if_else(
           Lctnmth %in% c("Location with differential GPS",
                          "Location with GPS"),
           "yes", "no")),
       aes(x = GPS, fill = valid_1)) +
  geom_bar() + labs(x = "GPS")
# Data set for RF model 2: all location methods, expert-rule validated
filtered_data2 <- data_validation %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  # Drop rows whose spectral indices fall outside the valid [-1, 1] range
  dplyr::filter(EVI_max <= 1, EVI_min >= -1,
                NDVI_max <= 1, NDMI_max <= 1, NDWI_min >= -1) %>%
  # Keep only rows complete for every RF predictor
  dplyr::filter(if_all(all_of(vars_RF), ~ !is.na(.x))) %>%
  # Phenology differences must be strictly positive
  dplyr::filter(if_all(contains("diff"), ~ .x > 0)) %>%
  # Keep only points that passed the rough expert-rule validation
  dplyr::filter(valid_1 == "No rules broken so far") %>%
  # Keep only the ID, the response and the predictors
  select(PlotObservationID, EUNISa_1, all_of(vars_RF))
Split into training and test data sets.
set.seed(123)
# 70/30 train/test split; seq_len()/floor() replace the 1:n idiom with
# identical RNG draws, so the split is unchanged.
train_indices2 <- sample(seq_len(nrow(filtered_data2)),
                         size = floor(0.7 * nrow(filtered_data2)))
train_data2 <- filtered_data2[train_indices2, ]
test_data2 <- filtered_data2[-train_indices2, ]
Number of points per category for filtered data:
filtered_data2 %>% count(EUNISa_1)
Confusion matrix:
confusionMatrix(predictions_rf2_S2, test_data2$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 936 230 144 1
R 707 4007 399 4
S 227 142 818 2
T 0 0 2 720
Overall Statistics
Accuracy : 0.7772
95% CI : (0.7681, 0.7861)
No Information Rate : 0.5251
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6357
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.5005 0.9150 0.60015 0.99037
Specificity 0.9420 0.7197 0.94682 0.99974
Pos Pred Value 0.7140 0.7831 0.68797 0.99723
Neg Pred Value 0.8671 0.8845 0.92378 0.99908
Prevalence 0.2242 0.5251 0.16345 0.08718
Detection Rate 0.1122 0.4805 0.09809 0.08634
Detection Prevalence 0.1572 0.6136 0.14258 0.08658
Balanced Accuracy 0.7213 0.8174 0.77348 0.99505
Variable Importance Plot
# Variable importance (standard and conditional) for the Sentinel-2 model;
# `margin` presumably widens the left margin for long variable names —
# confirm against the plot method of the varimp object used
plot(varimp_rf2_S2$varimp, margin = c(9, 3, 2, 2))
plot(varimp_rf2_cond_S2$varimp, margin = c(9, 3, 2, 2))
ROC curves:
# One-vs-rest ROC curves for each EUNIS class of the all-points model
roc2 <- ggplot(roc_data2_S2$roc, aes(x = FPR, y = TPR, color = Class)) +
  # `linewidth` replaces the `size` aesthetic for lines, deprecated in
  # ggplot2 >= 3.4 (this session loads 3.5.2)
  geom_line(linewidth = 1.2) +
  # Diagonal = no-skill reference
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc2
# Same filtering as filtered_data2, restricted to GPS-located plots
filtered_data3 <- data_validation %>%
  # GPS points only (plain or differential)
  dplyr::filter(Lctnmth %in% c("Location with GPS",
                               "Location with differential GPS")) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  # Discard rows whose spectral indices fall outside the valid [-1, 1] range
  dplyr::filter(
    EVI_max <= 1, EVI_min >= -1,
    NDVI_max <= 1, NDMI_max <= 1, NDWI_min >= -1
  ) %>%
  # Drop rows with missing values in any predictor used by the RF
  dplyr::filter(if_all(all_of(vars_RF), \(x) !is.na(x))) %>%
  # Every "diff" column must be strictly positive
  dplyr::filter(if_all(contains("diff"), \(x) x > 0)) %>%
  # Keep only points that passed the rough validation
  filter(valid_1 == "No rules broken so far") %>%
  # Keep only the columns the model needs
  select(PlotObservationID, EUNISa_1, all_of(vars_RF))
Split into training and test data sets.
set.seed(123)
# 70/30 split; floor() makes the sample size explicit (sample() would
# silently truncate a fractional size) and seq_len() is empty-data safe
train_indices3 <- sample(seq_len(nrow(filtered_data3)),
                         size = floor(0.7 * nrow(filtered_data3)))
train_data3 <- filtered_data3[train_indices3, ]
test_data3 <- filtered_data3[-train_indices3, ]
Number of points per category for filtered data:
filtered_data3 %>% count(EUNISa_1)
Confusion matrix:
confusionMatrix(predictions_rf3_S2, test_data3$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 1004 298 156 1
R 574 2409 275 3
S 188 140 757 1
T 0 0 1 133
Overall Statistics
Accuracy : 0.7244
95% CI : (0.7129, 0.7357)
No Information Rate : 0.4793
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5603
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.5685 0.8462 0.6367 0.96377
Specificity 0.8910 0.7245 0.9308 0.99983
Pos Pred Value 0.6881 0.7387 0.6971 0.99254
Neg Pred Value 0.8299 0.8365 0.9110 0.99914
Prevalence 0.2973 0.4793 0.2002 0.02323
Detection Rate 0.1690 0.4056 0.1274 0.02239
Detection Prevalence 0.2456 0.5490 0.1828 0.02256
Balanced Accuracy 0.7298 0.7853 0.7837 0.98180
Variable Importance Plot
# Variable importance (standard and conditional) for the GPS-only model;
# `margin` presumably widens the left margin for long variable names — confirm
plot(varimp_rf3_S2$varimp, margin = c(9, 3, 2, 2))
plot(varimp_rf3_cond_S2$varimp, margin = c(9, 3, 2, 2))
ROC curves:
# One-vs-rest ROC curves for the GPS-only model
roc3 <- ggplot(roc_data3_S2$roc, aes(x = FPR, y = TPR, color = Class)) +
  # `linewidth` replaces the `size` aesthetic for lines, deprecated in
  # ggplot2 >= 3.4 (this session loads 3.5.2)
  geom_line(linewidth = 1.2) +
  # Diagonal = no-skill reference
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc3
# GPS-only data set for the level-2 (EUNISa_2) model
filtered_data4 <- data_validation %>%
  # GPS points only (plain or differential)
  dplyr::filter(Lctnmth %in% c("Location with GPS",
                               "Location with differential GPS")) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1), EUNISa_2 = as.factor(EUNISa_2)) %>%
  # Discard rows whose spectral indices fall outside the valid [-1, 1] range
  dplyr::filter(
    EVI_max <= 1, EVI_min >= -1,
    NDVI_max <= 1, NDMI_max <= 1, NDWI_min >= -1
  ) %>%
  # Drop rows with missing values in any predictor used by the RF
  dplyr::filter(if_all(all_of(vars_RF), \(x) !is.na(x))) %>%
  # Every "diff" column must be strictly positive
  dplyr::filter(if_all(contains("diff"), \(x) x > 0)) %>%
  # Keep only points that passed the rough validation
  filter(valid_1 == "No rules broken so far") %>%
  # The level-2 habitat must be known
  filter(!is.na(EUNISa_2)) %>%
  # Keep only the columns the model needs
  select(PlotObservationID, EUNISa_1, EUNISa_2, all_of(vars_RF))
Number of points per category for filtered data:
filtered_data4 %>% count(EUNISa_2)
filtered_data4 <- filtered_data4 %>%
  # Retain only level-2 habitat classes represented by at least 50 plots,
  # then drop the now-empty factor levels
  group_by(EUNISa_2) %>%
  filter(n() >= 50) %>%
  ungroup() %>%
  mutate(EUNISa_2 = droplevels(EUNISa_2))
Number of points per category for filtered data:
filtered_data4 %>% count(EUNISa_2)
Split into training and test data sets.
set.seed(123)
# 70/30 split; floor() makes the sample size explicit (sample() would
# silently truncate a fractional size) and seq_len() is empty-data safe
train_indices4 <- sample(seq_len(nrow(filtered_data4)),
                         size = floor(0.7 * nrow(filtered_data4)))
train_data4 <- filtered_data4[train_indices4, ]
test_data4 <- filtered_data4[-train_indices4, ]
Confusion matrix:
confusionMatrix(predictions_rf4_S2, test_data4$EUNISa_2)
Confusion Matrix and Statistics
Reference
Prediction Q1 Q2 Q4 Q5 R1 R2 R3 R4 R5 R6 S3 S4 T1 T3
Q1 66 3 9 0 7 1 7 0 0 1 2 5 0 0
Q2 17 50 14 0 3 0 3 0 1 2 0 5 0 0
Q4 51 89 577 2 123 11 131 0 2 2 12 54 1 0
Q5 0 0 0 16 0 0 0 0 0 0 0 0 0 0
R1 73 131 250 10 1301 119 132 13 10 23 53 175 1 0
R2 2 3 3 1 17 61 12 6 5 0 1 2 0 0
R3 1 1 8 7 8 11 110 0 18 0 0 4 0 0
R4 0 0 0 0 1 0 0 3 0 0 0 0 0 0
R5 0 0 0 0 0 3 8 0 5 0 0 0 0 0
R6 0 1 0 0 0 0 0 0 0 1 0 0 0 0
S3 0 0 0 0 0 0 0 0 0 0 1 0 0 0
S4 139 118 69 3 119 9 37 0 2 7 41 787 1 0
T1 0 0 0 0 0 0 0 0 0 0 2 0 91 9
T3 0 0 0 0 0 0 0 0 0 0 0 0 5 6
Overall Statistics
Accuracy : 0.5794
95% CI : (0.566, 0.5928)
No Information Rate : 0.2975
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.4618
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q1 Class: Q2 Class: Q4 Class: Q5 Class: R1 Class: R2
Sensitivity 0.18911 0.126263 0.6204 0.410256 0.8239 0.28372
Specificity 0.99294 0.990837 0.8908 1.000000 0.7344 0.98979
Pos Pred Value 0.65347 0.526316 0.5469 1.000000 0.5679 0.53982
Neg Pred Value 0.94564 0.933615 0.9170 0.995653 0.9078 0.97035
Prevalence 0.06576 0.074618 0.1752 0.007349 0.2975 0.04051
Detection Rate 0.01244 0.009422 0.1087 0.003015 0.2451 0.01149
Detection Prevalence 0.01903 0.017901 0.1988 0.003015 0.4317 0.02129
Balanced Accuracy 0.59103 0.558550 0.7556 0.705128 0.7792 0.63675
Class: R3 Class: R4 Class: R5 Class: R6 Class: S3 Class: S4
Sensitivity 0.25000 0.1363636 0.1162791 0.0277778 0.0089286 0.7626
Specificity 0.98808 0.9998108 0.9979103 0.9998103 1.0000000 0.8725
Pos Pred Value 0.65476 0.7500000 0.3125000 0.5000000 1.0000000 0.5908
Neg Pred Value 0.93579 0.9964171 0.9928180 0.9934025 0.9790803 0.9384
Prevalence 0.08291 0.0041455 0.0081025 0.0067835 0.0211042 0.1945
Detection Rate 0.02073 0.0005653 0.0009422 0.0001884 0.0001884 0.1483
Detection Prevalence 0.03166 0.0007537 0.0030149 0.0003769 0.0001884 0.2510
Balanced Accuracy 0.61904 0.5680872 0.5570947 0.5137940 0.5044643 0.8176
Class: T1 Class: T3
Sensitivity 0.91919 0.400000
Specificity 0.99789 0.999055
Pos Pred Value 0.89216 0.545455
Neg Pred Value 0.99846 0.998301
Prevalence 0.01865 0.002826
Detection Rate 0.01715 0.001131
Detection Prevalence 0.01922 0.002073
Balanced Accuracy 0.95854 0.699528
Variable Importance Plot
# Variable importance (standard and conditional) for the level-2 model;
# `margin` presumably widens the left margin for long variable names — confirm
plot(varimp_rf4_S2$varimp, margin = c(9, 3, 2, 2))
plot(varimp_rf4_cond_S2$varimp, margin = c(9, 3, 2, 2))
ROC curves:
# One-vs-rest ROC curves for the level-2 model
roc4 <- ggplot(roc_data4_S2$roc, aes(x = FPR, y = TPR, color = Class)) +
  # `linewidth` replaces the `size` aesthetic for lines, deprecated in
  # ggplot2 >= 3.4 (this session loads 3.5.2)
  geom_line(linewidth = 1.2) +
  # Diagonal = no-skill reference
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc4
Refinement based on the variables SAVI_pos_value, NDWI_min, NDMI_min and NDMI_max (the variable importances should be checked carefully later).
Distribution plots:
#' Raincloud-style distribution plots per EUNIS level-1 habitat, one plot per
#' variable. Points outside their habitat's 10th-90th percentile band are
#' greyed out as potential outliers.
#'
#' data     : data frame with EUNISa_1, EUNISa_1_descr and the y_vars columns
#' y_vars   : character vector of column names to plot
#' y_labels : y-axis labels, parallel to y_vars
distr_plot_percentiles <- function(data, y_vars, y_labels) {
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    # Per-habitat 10th and 90th percentiles of the current variable
    percentiles <- data %>%
      group_by(EUNISa_1) %>%
      summarise(
        p10 = quantile(.data[[y_var]], 0.1, na.rm = TRUE),
        p90 = quantile(.data[[y_var]], 0.9, na.rm = TRUE),
        .groups = "drop"
      )
    # Flag each observation relative to its habitat's percentile band
    data_flagged <- data %>%
      left_join(percentiles, by = "EUNISa_1") %>%
      mutate(outlier_flag = case_when(
        .data[[y_var]] < p10 ~ "low",
        .data[[y_var]] > p90 ~ "high",
        TRUE ~ "mid"
      ))
    # NOTE(review): geom_flat_violin() is not part of ggplot2 — presumably
    # sourced earlier (raincloud-plot helper); confirm it is defined.
    p <- ggplot(data = data_flagged %>%
                  filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = .data[[y_var]])) +
      geom_flat_violin(aes(fill = EUNISa_1_descr),
                       position = position_nudge(x = 0.2, y = 0), alpha = 0.8) +
      # Grey out points outside the 10-90 percentile band
      geom_point(aes(color = ifelse(outlier_flag == "mid",
                                    EUNISa_1_descr, "grey")),
                 position = position_jitter(width = 0.15), size = 1,
                 alpha = 0.6) +
      geom_boxplot(aes(fill = EUNISa_1_descr), width = 0.2, outlier.shape = NA,
                   alpha = 0.5) +
      stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
      # Annotate each group with its sample size just above its maximum;
      # after_stat(label) replaces the deprecated ..label.. (ggplot2 >= 3.4)
      stat_summary(fun.data = function(x) data.frame(y = max(x, na.rm = TRUE) +
                                                       0.1, label = length(x)),
                   geom = "text", aes(label = after_stat(label)), vjust = 0.5) +
      labs(y = y_label, x = "EUNIS level 1") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      # "none" replaces the deprecated logical FALSE (ggplot2 >= 3.3.4)
      guides(fill = "none", color = "none") +
      theme_bw() + coord_flip()
    print(p)
  }
}
# Distribution plots with percentile flagging for the candidate refinement
# variables, restricted to the GPS points that passed the rough validation
distr_plot_percentiles(data_validation %>%
                         # Get all points after rough validation
                         filter(PlotObservationID %in% filtered_data3$PlotObservationID),
                       c("canopy_height", "SAVI_pos_value", "NDWI_min", "NDMI_min",
                         "NDMI_max"),
                       c("canopy_height", "SAVI_pos_value", "NDWI_min", "NDMI_min",
                         "NDMI_max"))
Canopy height (canopy_height) is not used for the refinement so far.
Calculate percentiles:
# Habitat-specific percentile thresholds (10/20/80/90) of the refinement
# variables, computed from the GPS points that passed the rough validation.
percentiles6 <- data_validation %>%
  filter(PlotObservationID %in% filtered_data3$PlotObservationID) %>%
  group_by(EUNISa_1) %>%
  summarize(
    # One column per variable/probability pair, named
    # percentile_<prob>_<variable> to match the joins further down.
    # across() replaces 16 copy-pasted quantile() calls; TRUE instead of the
    # reassignable shorthand T.
    across(
      c(SAVI_pos_value, NDWI_min, NDMI_min, NDMI_max),
      list(
        `10` = \(x) quantile(x, probs = 0.10, na.rm = TRUE),
        `20` = \(x) quantile(x, probs = 0.20, na.rm = TRUE),
        `80` = \(x) quantile(x, probs = 0.80, na.rm = TRUE),
        `90` = \(x) quantile(x, probs = 0.90, na.rm = TRUE)
      ),
      .names = "percentile_{.fn}_{.col}"
    ),
    .groups = "drop"
  )
# Refined data set: keep only observations inside their habitat's
# 10th-90th percentile band for every refinement variable
filtered_data6 <- data_validation %>%
  # Restrict to the GPS points that survived the rough validation
  filter(PlotObservationID %in% filtered_data3$PlotObservationID) %>%
  select(PlotObservationID, EUNISa_1, all_of(vars_RF)) %>%
  left_join(percentiles6, by = "EUNISa_1") %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(
    SAVI_pos_value >= percentile_10_SAVI_pos_value,
    SAVI_pos_value <= percentile_90_SAVI_pos_value,
    NDWI_min >= percentile_10_NDWI_min,
    NDWI_min <= percentile_90_NDWI_min,
    NDMI_min >= percentile_10_NDMI_min,
    NDMI_min <= percentile_90_NDMI_min,
    NDMI_max >= percentile_10_NDMI_max,
    NDMI_max <= percentile_90_NDMI_max
  )
Split into training and test data sets.
set.seed(123)
# 70/30 split; floor() makes the sample size explicit (sample() would
# silently truncate a fractional size) and seq_len() is empty-data safe
train_indices6 <- sample(seq_len(nrow(filtered_data6)),
                         size = floor(0.7 * nrow(filtered_data6)))
train_data6 <- filtered_data6[train_indices6, ]
test_data6 <- filtered_data6[-train_indices6, ]
Number of points per category for filtered data:
filtered_data6 %>% count(EUNISa_1)
Confusion matrix:
confusionMatrix(predictions_rf6_S2, test_data6$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 474 175 56 0
R 238 1188 98 0
S 70 65 400 2
T 0 0 0 66
Overall Statistics
Accuracy : 0.7514
95% CI : (0.7351, 0.7672)
No Information Rate : 0.5042
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6005
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.6061 0.8319 0.7220 0.97059
Specificity 0.8873 0.7607 0.9399 1.00000
Pos Pred Value 0.6723 0.7795 0.7449 1.00000
Neg Pred Value 0.8552 0.8165 0.9329 0.99928
Prevalence 0.2761 0.5042 0.1956 0.02401
Detection Rate 0.1674 0.4195 0.1412 0.02331
Detection Prevalence 0.2489 0.5381 0.1896 0.02331
Balanced Accuracy 0.7467 0.7963 0.8309 0.98529
Variable Importance Plot
# Variable importance (standard and conditional) for the 10-90 refined model;
# `margin` presumably widens the left margin for long variable names — confirm
plot(varimp_rf6_S2$varimp, margin = c(9, 3, 2, 2))
plot(varimp_rf6_cond_S2$varimp, margin = c(9, 3, 2, 2))
ROC curves:
# One-vs-rest ROC curves for the 10-90 percentile refined model
roc6 <- ggplot(roc_data6_S2$roc, aes(x = FPR, y = TPR, color = Class)) +
  # `linewidth` replaces the `size` aesthetic for lines, deprecated in
  # ggplot2 >= 3.4 (this session loads 3.5.2)
  geom_line(linewidth = 1.2) +
  # Diagonal = no-skill reference
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc6
# Stricter refinement: keep only observations inside their habitat's
# 20th-80th percentile band for every refinement variable
filtered_data7 <- data_validation %>%
  # Restrict to the GPS points that survived the rough validation
  filter(PlotObservationID %in% filtered_data3$PlotObservationID) %>%
  select(PlotObservationID, EUNISa_1, all_of(vars_RF)) %>%
  left_join(percentiles6, by = "EUNISa_1") %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(
    SAVI_pos_value >= percentile_20_SAVI_pos_value,
    SAVI_pos_value <= percentile_80_SAVI_pos_value,
    NDWI_min >= percentile_20_NDWI_min,
    NDWI_min <= percentile_80_NDWI_min,
    NDMI_min >= percentile_20_NDMI_min,
    NDMI_min <= percentile_80_NDMI_min,
    NDMI_max >= percentile_20_NDMI_max,
    NDMI_max <= percentile_80_NDMI_max
  )
Split into training and test data sets.
set.seed(123)
# 70/30 split; floor() makes the sample size explicit (sample() would
# silently truncate a fractional size) and seq_len() is empty-data safe
train_indices7 <- sample(seq_len(nrow(filtered_data7)),
                         size = floor(0.7 * nrow(filtered_data7)))
train_data7 <- filtered_data7[train_indices7, ]
test_data7 <- filtered_data7[-train_indices7, ]
Number of points per category for filtered data:
filtered_data7 %>% count(EUNISa_1)
Confusion matrix:
confusionMatrix(predictions_rf7_S2, test_data7$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 222 60 13 0
R 97 375 15 0
S 20 12 178 0
T 0 0 0 25
Overall Statistics
Accuracy : 0.7866
95% CI : (0.7601, 0.8114)
No Information Rate : 0.4395
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6719
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.6549 0.8389 0.8641 1.00000
Specificity 0.8923 0.8035 0.9605 1.00000
Pos Pred Value 0.7525 0.7700 0.8476 1.00000
Neg Pred Value 0.8380 0.8642 0.9653 1.00000
Prevalence 0.3333 0.4395 0.2026 0.02458
Detection Rate 0.2183 0.3687 0.1750 0.02458
Detection Prevalence 0.2901 0.4789 0.2065 0.02458
Balanced Accuracy 0.7736 0.8212 0.9123 1.00000
Variable Importance Plot
# Variable importance (standard and conditional) for the 20-80 refined model;
# `margin` presumably widens the left margin for long variable names — confirm
plot(varimp_rf7_S2$varimp, margin = c(9, 3, 2, 2))
plot(varimp_rf7_cond_S2$varimp, margin = c(9, 3, 2, 2))
ROC curves:
# One-vs-rest ROC curves for the 20-80 percentile refined model
roc7 <- ggplot(roc_data7_S2$roc, aes(x = FPR, y = TPR, color = Class)) +
  # `linewidth` replaces the `size` aesthetic for lines, deprecated in
  # ggplot2 >= 3.4 (this session loads 3.5.2)
  geom_line(linewidth = 1.2) +
  # Diagonal = no-skill reference
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
# BUG FIX: the original printed roc6 here, re-displaying the previous model's
# ROC curves instead of this block's roc7
roc7
# Load world boundaries
world <- ne_countries(scale = "medium", returnclass = "sf")
# Calculate the extent of the points (GPS-located plots of the four target
# habitat groups with either Sentinel-2 or Landsat data)
points_GPS_extent <- data_validation %>%
  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  # TRUE instead of the reassignable shorthand T
  dplyr::filter(S2_data == TRUE | Landsat_data == TRUE) %>%
  dplyr::filter(`Location method` == "Location with differential GPS" |
                  `Location method` == "Location with GPS") %>%
  summarise(lon_min = min(Lon_updated, na.rm = TRUE),
            lon_max = max(Lon_updated, na.rm = TRUE),
            lat_min = min(Lat_updated, na.rm = TRUE),
            lat_max = max(Lat_updated, na.rm = TRUE))
# Add padding to the extent (adjust as needed)
padding <- 2 # Adjust padding to your preference
x_limits <- c(points_GPS_extent$lon_min - padding,
              points_GPS_extent$lon_max + padding)
y_limits <- c(points_GPS_extent$lat_min - padding,
              points_GPS_extent$lat_max + padding)
# Create the zoomed map of GPS points, colored by habitat group
ggplot() +
  geom_sf(data = world, fill = "lightblue", color = "gray") +
  geom_point(data = data_validation %>%
               dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
               dplyr::filter(S2_data == TRUE | Landsat_data == TRUE) %>%
               dplyr::filter(`Location method` == "Location with differential GPS" |
                               `Location method` == "Location with GPS"),
             aes(x = Lon_updated, y = Lat_updated, color = EUNISa_1),
             size = 1) +
  coord_sf(xlim = x_limits, ylim = y_limits) +
  theme_minimal()
Number of GPS points by Country:
# GPS points per country (same subset as the GPS map above)
data_validation %>%
  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  # TRUE instead of the reassignable shorthand T
  dplyr::filter(S2_data == TRUE | Landsat_data == TRUE) %>%
  dplyr::filter(`Location method` == "Location with differential GPS" |
                  `Location method` == "Location with GPS") %>%
  count(Country)
# Calculate the extent of the points (all ReSurvey plots of the four target
# habitat groups with either Sentinel-2 or Landsat data, any location method)
points_resurvey_extent <- data_validation %>%
  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  # TRUE instead of the reassignable shorthand T
  dplyr::filter(S2_data == TRUE | Landsat_data == TRUE) %>%
  summarise(lon_min = min(Lon_updated, na.rm = TRUE),
            lon_max = max(Lon_updated, na.rm = TRUE),
            lat_min = min(Lat_updated, na.rm = TRUE),
            lat_max = max(Lat_updated, na.rm = TRUE))
# Add padding to the extent (adjust as needed)
padding <- 2 # Adjust padding to your preference
x_limits <- c(points_resurvey_extent$lon_min - padding,
              points_resurvey_extent$lon_max + padding)
y_limits <- c(points_resurvey_extent$lat_min - padding,
              points_resurvey_extent$lat_max + padding)
# Create the zoomed map of all ReSurvey points, colored by habitat group
ggplot() +
  geom_sf(data = world, fill = "lightblue", color = "gray") +
  geom_point(data = data_validation %>%
               dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
               dplyr::filter(S2_data == TRUE | Landsat_data == TRUE),
             aes(x = Lon_updated, y = Lat_updated, color = EUNISa_1),
             size = 1) +
  coord_sf(xlim = x_limits, ylim = y_limits) +
  theme_minimal()
Number of ReSurvey points by Country:
# ReSurvey points per country (same subset as the ReSurvey map above)
data_validation %>%
  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  # TRUE instead of the reassignable shorthand T
  dplyr::filter(S2_data == TRUE | Landsat_data == TRUE) %>%
  count(Country)
# Raw exports for the Cordillera case study: spectral indices, phenology
# metrics and 1 m canopy height for the AlpineGrasslands and VegetationTypes
# layers.
# NOTE(review): absolute Windows paths — consider here::here() or a
# configurable data directory so the script is portable.
AlpineGrasslands_indices <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrassland_Sentinel_Plot_Allyear_Allmetrics.csv")
AlpineGrasslands_phen <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv")
AlpineGrasslands_CH <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_CanopyHeight_1m.csv")
VegetationTypes_indices <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_Sentinel_Plot_AllYear_Allmetrics.csv")
VegetationTypes_phen <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv")
VegetationTypes_CH <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_CanopyHeight_1m.csv")
# Join the three AlpineGrasslands exports (indices + phenology + canopy
# height) into one table; full_join() without `by` joins on all shared columns.
AlpineGrasslands <- AlpineGrasslands_indices %>%
  select(-`system:index`, -.geo, -Localidad) %>%
  # "H�bitat" is the mis-encoded column name exactly as it arrives from the
  # CSV; the string must match byte-for-byte for the rename to work
  rename(Hábitat = "H�bitat") %>%
  full_join(AlpineGrasslands_phen %>%
              select(-`system:index`, -.geo, -Localidad) %>%
              rename(Hábitat = "H�bitat")) %>%
  full_join(AlpineGrasslands_CH %>%
              select(-`system:index`, -.geo, -Localidad)) %>%
  # `Precisi�n` is likewise the mis-encoded source column name
  select(-Date__year, - `Precisi�n`) %>%
  mutate(DATE = ymd(DATE)) %>%
  rename(ID = "Releve_num") %>%
  # Character ID so it binds cleanly with the other layer's IDs
  mutate(ID = as.character(ID)) %>%
  mutate(layer = "AlpineGrasslands")
# Join the three VegetationTypes exports (indices + phenology + canopy height)
# into one table after dropping the Earth Engine metadata columns;
# full_join() without `by` joins on all shared columns.
VegetationTypes <- list(VegetationTypes_indices,
                        VegetationTypes_phen,
                        VegetationTypes_CH) %>%
  map(\(df) select(df, -`system:index`, -.geo)) %>%
  reduce(full_join) %>%
  rename(Hábitat = "TYPE") %>%
  mutate(layer = "VegetationTypes")
Merge both datasets:
# Stack both field data sets and derive EUNIS level-1 codes from the
# (partly Spanish) habitat descriptions
cordillera <- bind_rows(
  AlpineGrasslands %>% select(DATE, ID, starts_with("NDMI"),
                              starts_with("NDVI"), Hábitat, "EOS_DOY",
                              "Peak_DOY", "SOS_DOY", "Season_Length",
                              "canopy_height", "layer"),
  VegetationTypes %>% select(DATE, ID, starts_with("NDMI"),
                             starts_with("NDVI"), Hábitat, "EOS_DOY",
                             "Peak_DOY", "SOS_DOY", "Season_Length",
                             "canopy_height", "layer")
) %>%
  mutate(
    # BUG FIX: the original prefixed every case with `Hábitat =`, which passed
    # the formulas to case_when() as spurious named arguments; the conditions
    # are plain formulas. NA habitats fall through the str_detect() cases
    # (which yield NA) and are classified as grassland by the is.na() case.
    EUNISa_1 = case_when(
      str_detect(Hábitat, "Pastizal|Cervunal|grassland|meadow") ~ "R",
      str_detect(Hábitat, "forest") ~ "T",
      str_detect(Hábitat, "Scrub|scrub|Shrubland|shrubland|shrub|Heathland") ~ "S",
      str_detect(Hábitat, "Suelo|Scree|scree|cliff") ~ "U",
      is.na(Hábitat) ~ "R",
      TRUE ~ NA_character_
    ),
    # Human-readable habitat description for plotting
    EUNISa_1_descr = case_when(
      EUNISa_1 == "R" ~ "Grasslands",
      EUNISa_1 == "T" ~ "Forests and other wooded land",
      EUNISa_1 == "S" ~ "Heathlands, scrub and tundra",
      EUNISa_1 == "U" ~ "Inland habitats with no or little soil"
    )
  )
# NDVI and NDMI distribution plots for the Cordillera data; distr_plot() is
# defined earlier in the script (not shown in this chunk) — presumably a
# raincloud layout similar to distr_plot_percentiles(), confirm
distr_plot(cordillera,
           c("NDVI_max", "NDVI_p90", "NDVI_min", "NDVI_p10"),
           c("NDVI max", "NDVI p90", "NDVI min", "NDVI p10"))
distr_plot(cordillera,
           c("NDMI_max", "NDMI_p90", "NDMI_min", "NDMI_p10"),
           c("NDMI max", "NDMI p90", "NDMI min", "NDMI p10"))
sessionInfo()